# output:
# BiocStyle::pdf_document:
# toc: TRUE
#
# knitr::opts_chunk$set(echo=FALSE, error=FALSE, warning=FALSE,message=FALSE)
# tables
# replace html with latex
# regulatedEvents %>%
# slice_sample(n=10) %>%
# kable(., "html") %>%
# kableExtra::kable_styling("striped") %>%
# kableExtra::scroll_box(height="300px", width = "94%")
This report describes the quantification of KCl sensitive of RT-Stop sites in the in-vitro RT-Stop data sets. A detailed description of the initial RT-Stop site definition of KCl and NaCl is given in the reports RT_Stop_Sites_Kcl.Rmd and RT_Stop_Sites_NaCl.Rmd. We expect some of these peaks to change in their strength depending on the presence of NaCl or KCl. KCl is meant to stabilize G4 formation, therefore peaks should mostly go up when comparing to NaCl. Here we want to test which G4 peaks are sensitive for KCl and which are not.
### ============================================================================
### Load Packages
### ----------------------------------------------------------------------------
###
library(rtracklayer)
library(GenomicRanges)
library(ggplot2)
library(AnnotationDbi)
library(dplyr)
library(reshape2)
library(UpSetR)
library(GenomicFeatures)
library(kableExtra)
library(knitr)
library(ggrepel)
library(gridExtra)
library(grid)
library(viridis)
library(ComplexHeatmap)
library(forcats)
library(ggtext)
library(patchwork)
library(ggforce)
library(ggpointdensity)
library(knitr)
library(tidyr)
library(ggsci)
library(DESeq2)
library(gghalves)
library(ggrastr)
library(ggnewscale)
library(BindingSiteFinder)
library(xlsx)
library(ggseqlogo)
library(stringr)
source("/Users/mirko/Projects/GeneralScripts/CustomThemes.R")
Here I load the raw signal per construct as list and the related sets of peaks as GenomicRanges from RData files.
# Load RT-Stop data per construct
constructInfo = readRDS("../rds/constructInformation.rds")
constructInfo = constructInfo[!grepl("Spike",constructInfo$Region.ID),]
constructSignal = readRDS("../rds/RTstop.profiling.l.rds")
# Load KCl sites
load("../01_BindingSites/condition_kcl/mean_cl1/data/sitesReproducible.rda")
name = paste0(seqnames(sitesReproducible), "_", unlist(sapply(runLength(seqnames(sitesReproducible)), seq)))
names(sitesReproducible) = name
seqlevels(sitesReproducible) <- as.character(sprintf("%05d", 1:12000))
peaksKCL = sitesReproducible
# Load NaCl sites
load("../01_BindingSites/condition_nacl/data/sitesReproducible.rda")
name = paste0(seqnames(sitesReproducible), "_", unlist(sapply(runLength(seqnames(sitesReproducible)), seq)))
names(sitesReproducible) = name
seqlevels(sitesReproducible) <- as.character(sprintf("%05d", 1:12000))
peaksNACL = sitesReproducible
Here I combined both sets of peaks. To keep the desired width of 3nt I applied the merge and re-size routine on all peak midpoints. This essentially uses the combined coverage of all replicates in both conditions to decide on the center of any peak region that only partially overlaps in the two sets. In the end I annotate for each peak if it was seen in NaCl, KCl or both conditions.
# Resize bs to their center and combine over conditions
peaks = unique(c(resize(peaksKCL, fix = "center", width = 1),
resize(peaksNACL, fix = "center", width = 1)))
# Prepare BSF data object
meta = data.frame(id = 1:6, condition = c(rep("GTP_KCL",3), rep("GTP_NACL", 3)))
sgn = list(signalPlus = constructSignal[1:6])
names(sgn$signalPlus) = paste0(meta$id, "_", meta$condition)
sgn$signalMinus = constructSignal[1:6]
names(sgn$signalMinus) = paste0(meta$id, "_", meta$condition)
bds = BSFDataSet(ranges = peaks, meta = meta, signal = sgn, silent = TRUE)
# Final binding site settings
bdsMerge <- makeBindingSites(object = bds, bsSize = 3, minWidth = 0,
minCrosslinks = 2, minClSites = 1, centerIsSummit = FALSE)
df = myNumberFormat(getSummary(bdsMerge))
kable(df, "html", caption = "Merge and combine") %>%
kableExtra::kable_styling("striped") %>%
kableExtra::scroll_box(height="300px", width = "94%")
| Option | nRanges |
|---|---|
| inputRanges | 46,326 |
| mergeCrosslinkSites | 42,271 |
| minCrosslinks | 42,271 |
| minClSites | 42,271 |
| centerIsClSite | 42,239 |
| centerIsSummit | NA |
# kable(myNumberFormat(df), "latex", caption = "Merge and combine", booktabs = TRUE)
rng = BindingSiteFinder::getRanges(bdsMerge)
r1 = resize(peaksKCL, fix = "center", width = 1)
r2 = resize(peaksNACL, fix = "center", width = 1)
df = data.frame(olKCL = countOverlaps(rng, r1),
olNACL = countOverlaps(rng, r2))
m = make_comb_mat(df)
ha = HeatmapAnnotation(
"Intersections" = anno_barplot(comb_size(m),
border = FALSE,
gp = gpar(fill = "#a6a6a6"),
height = unit(6, "cm"))
)
ha2 = HeatmapAnnotation(
"set sizes" = anno_barplot(set_size(m),
border = FALSE,
gp = gpar(fill = "#a6a6a6"),
width = unit(4, "cm")),
which = "row"
)
ht = UpSet(m,
comb_order = order(comb_size(m), decreasing = T),
top_annotation = ha,
right_annotation = ha2,
comb_col = "#003399", bg_col = "white", pt_size = unit(.5, "cm") ,
border = T, lwd = 2, bg_pt_col = "#a6a6a6"
)
ss = set_size(m)
cs = comb_size(m)
ht = draw(ht, padding = unit(c(0, 0, 10, 0), "mm"))
od = column_order(ht)
decorate_annotation("Intersections", {
grid.text(format(cs[od], big.mark = ".", decimal.mark = ","),
x = seq_along(cs), y = unit(cs[od], "native") + unit(.1, "pt"),
default.units = "native", just = c("left", "bottom"),
gp = gpar(fontsize = 8, col = "black"), rot = 45)
})
Figure 1: Overlap between NaCl and KCl peaks
Overlaps were computed after resolving partial overlaps.
# Annotate with condition support
mcols(rng) = cbind(mcols(rng), df)
peaks_all = rng
# Set peak names
name = paste0(seqnames(peaks_all), "_", unlist(sapply(runLength(seqnames(peaks_all)), seq)))
names(peaks_all) = name
seqlevels(peaks_all) = as.character(sprintf("%05d", 1:12000))
# Export as bed file
rtracklayer::export(rng,
"./data/peaks_all.bed",
format = "BED",
trackLine = new("BasicTrackLine",
name = "GTP-KCl+NaCl peaks",
description = "Merged 3nt RTstop peaks, reproducible in all 3 replicates per condition, partial-overlap resolved.") )
save(rng, file = "./data/peaks_all.rda")
Constructs with in general low counts were removed from the differential analysis, because for these we don’t have the count power to tell if they are sensitive to KCl or not. In order to do so, counts per construct were normalized to library size with DESeq2 and only constructs with at least 100 normalized counts from all 6 samples were retained.
# Select summed counts per construct
m = sapply(constructSignal, sum) %>%
as.data.frame() %>%
dplyr::select(1:6)
# Normalize counts by library size
meta = data.frame(id = 1:6, condition = c(rep("GTP_KCL",3), rep("GTP_NACL", 3)))
dds = DESeqDataSetFromMatrix(countData = m, colData = meta, design = ~1)
dds = estimateSizeFactors(dds)
normCounts = counts(dds, normalized = TRUE)
# Plot count diagnostic
df = normCounts %>%
as.data.frame() %>%
tibble::rownames_to_column() %>%
pivot_longer(-rowname) %>%
group_by(name) %>%
mutate(rank = rank(value, ties.method = "first"))
ggplot(df, aes(x = rank, y = value)) +
ggrastr::rasterise(geom_pointdensity(), dpi = 300) +
facet_wrap(~name) +
theme_nice() +
geom_hline(yintercept = 100, linetype = "dashed") +
scale_y_log10() +
scale_color_viridis(option = "rocket") +
theme(legend.position = "top") +
guides(color = guide_colorbar(title.position = 'top', title.hjust = 0.5,
barwidth = unit(20, 'lines'), barheight = unit(.5, 'lines'))) +
labs(x = "Rank",
y = "Normalized counts")
Figure 2: Global expression
Counts were normalized for library size with DESeq. These were summed up per construct and sorted by their rank. Only constructs with at least 100 normalized counts in all 6 samples are considered sufficiently large expressed.
# Select constructs above threshold
const_Expressed = as.data.frame(table(df$rowname[df$value >= 100]))
const_Expressed = const_Expressed$Var1[const_Expressed$Freq == 6]
# Exclude all constrcuts that are not above threshold
const_All = levels(seqnames(peaks_all))
const_notExpressed = const_All[!const_All %in% const_Expressed]
# Exclude all constructs with high enough expression only in the read-through region
# -> no peak was called on these
const_withPeak = unique(seqnames(peaks_all))
const_onlyReadThroughSignal = const_Expressed[!const_Expressed %in% const_withPeak]
const_ExpressedAndPeak = const_Expressed[!const_Expressed %in% const_onlyReadThroughSignal]
peaks_expressed = peaks_all[seqnames(peaks_all) %in% const_ExpressedAndPeak]
To compute fold-changes per peak for each construct between the NaCl and KCl condition I used a DESeq2 based approach. In order to remove the bias of constructs that strongly change in their expression between the conditions, I used a modified DESeq2 test.
Here I specifically tested for those constructs that significantly change less than a log2 fold-change of 1 (less than 2-fold), using the parameter altHypothesis = "lessAbs" and an lfcThreshold = 1 in the results function. This results in all constructs that are affected by a less than 2-fold global change between conditions.
# Get total counts per construct
m = sapply(constructSignal, sum) %>%
as.data.frame() %>%
dplyr::select(c(1:6))
m = m[rownames(m) %in% unique(seqnames(peaks_expressed)),]
# Run DESeq2
dds = DESeqDataSetFromMatrix(countData = m, colData = meta, design = ~condition)
dds = DESeq(dds)
resGlobalNot = results(dds, contrast = c("condition", "GTP_KCL", "GTP_NACL"),
altHypothesis = "lessAbs", lfcThreshold = 1, alpha = 0.05) %>%
as.data.frame() %>% mutate(sig = ifelse(padj < 0.05, TRUE, FALSE)) %>%
mutate(sigDir = ifelse(sig == FALSE & log2FoldChange > 0, "Up",
ifelse(sig == FALSE & log2FoldChange < 0, "Down", "Not"))) %>%
mutate(sigDir = factor(sigDir, levels = c("Not", "Up", "Down")))
# Filter results
const_globalLarge = rownames(resGlobalNot[resGlobalNot$sig == FALSE,])
const_globalSmall = rownames(resGlobalNot[resGlobalNot$sig == TRUE,])
peaks_globalLarge = peaks_expressed[seqnames(peaks_expressed) %in% const_globalLarge]
peaks_globalSmall = peaks_expressed[seqnames(peaks_expressed) %in% const_globalSmall]
df = resGlobalNot %>%
mutate(sig = ifelse(padj < 0.05, TRUE, FALSE))
dfN = df %>%
group_by(sig) %>%
tally()
p1 = ggplot(dfN, aes(x = "Construtcts", y = n, fill = sig, color = sig)) +
geom_col(position = "fill") +
scale_fill_manual(values = c("#999999", "#D8D9CF")) +
scale_color_manual(values = c("#4d4d4d", "#979a7e")) +
theme_pub() +
theme(legend.position = "top") +
labs(
x = "",
y = "Fraction",
fill = "Significant",
color = "Significant"
) +
guides(color = guide_legend(override.aes = list(size = 1))) +
theme(legend.key.size = unit(0.5, 'cm'), legend.position = "top") +
coord_flip()
p2 = ggplot(df, aes(x = log2FoldChange)) +
annotate("rect", xmin = -1, xmax = 1, ymax = Inf, ymin = -Inf, fill = "#D8D9CF") +
geom_histogram(bins = 50, fill = "#999999", color = "#4d4d4d") +
theme_pub() +
theme(aspect.ratio = 1) +
geom_vline(xintercept = -1, linetype = "dashed") +
geom_vline(xintercept = 1, linetype = "dashed") +
labs(
x = "Fold-change[log2]",
y = "Count"
)
p1 / p2 + patchwork::plot_layout(heights = c(0.1, 0.9))
Figure 3: Basic plots of constructs that do not change more than 2-fold globaly
p1 = ggplot() +
geom_point(data = df, aes(x = log2(baseMean), y = log2FoldChange, fill = sig, color = sig), shape = 21) +
scale_fill_manual(values = c("#999999", "#D8D9CF")) +
scale_color_manual(values = c("#4d4d4d", "#979a7e")) +
theme_pub() +
theme(legend.position = "top") +
labs(
x = "BaseMean [log2]",
y = "Fold-change [log2]",
fill = "Significant",
color = "Significant"
) +
guides(color = guide_legend(override.aes = list(size = 4))) +
theme(legend.key.size = unit(1, 'cm'), legend.position = "top") +
theme(legend.position = "top") +
ylim(-3,3) +
theme(aspect.ratio = 1)
p2 = ggplot() +
geom_point(data = df, aes(x = log2FoldChange, y = -log10(padj), fill = sig, color = sig), shape = 21) +
scale_fill_manual(values = c("#999999", "#D8D9CF")) +
scale_color_manual(values = c("#4d4d4d", "#979a7e")) +
theme_pub() +
theme(legend.position = "top") +
labs(
x = "Fold-change [log2]",
y = "Adjusted P value [-log10]",
fill = "Significant",
color = "Significant"
) +
guides(color = guide_legend(override.aes = list(size = 4))) +
theme(legend.key.size = unit(1, 'cm'), legend.position = "top") +
theme(legend.position = "top") +
xlim(-3,3) +
theme(aspect.ratio = 1)
p1 + p2
Figure 4: MA and volcanoe plots of constructs that do not change more than 2-fold globaly
Next I run a second round of DESeq2 on the level of individual RT-Stop peaks. Here I now test for each peak if counts are different between NaCl and KCl using (contrast = c("condition", "GTP_KCL", "GTP_NACL")). Lastly, all peaks from constructs that change their expression level significantly are removed.
# Get crosslink counts per peak
bds = setRanges(bds, peaks_expressed)
cov = coverageOverRanges(bds, returnOptions = "merge_positions_keep_replicates")
# Prepare SE object
se = SummarizedExperiment(assays = list(counts = as.matrix(mcols(cov))),
rowRanges = granges(cov), colData = meta)
# Run DESeq2
dds = DESeqDataSet(se, design = ~ condition)
dds = DESeq(dds)
resSimple = lfcShrink(dds, contrast = c("condition", "GTP_KCL", "GTP_NACL"), type = "ashr")
resSimple = as.data.frame(resSimple) %>%
mutate(sig = ifelse(padj < 0.05, TRUE, FALSE)) %>%
mutate(sigDir = ifelse(sig == TRUE & log2FoldChange > 0, "Up",
ifelse(sig == TRUE & log2FoldChange < 0, "Down", "Not"))) %>%
mutate(sigDir = factor(sigDir, levels = c("Not", "Up", "Down")))
idx = match(rownames(resSimple), names(cov))
resSimple$constructID = as.character(seqnames(cov))[idx]
# Combine construct and peak results
resChange = resSimple[resSimple$constructID %in% const_globalSmall,]
df = resSimple %>%
mutate(sig = ifelse(padj < 0.05, TRUE, FALSE)) %>%
mutate(sigDir = ifelse(sig == TRUE & log2FoldChange > 0, "Up",
ifelse(sig == TRUE & log2FoldChange < 0, "Down", "Not"))) %>%
mutate(sigDir = factor(sigDir, levels = c("Not", "Up", "Down")))
dfN = df %>%
group_by(sigDir) %>%
tally()
p1 = ggplot(dfN, aes(x = "Construtcts", y = n, fill = sigDir, color = sigDir)) +
geom_col(position = "fill") +
scale_fill_manual(values = c("#999999", "#874C62", "#68b1a5")) +
scale_color_manual(values = c("#4d4d4d", "#623747", "#2b544d")) +
theme_pub() +
theme(legend.position = "top") +
labs(
x = "",
y = "Fraction",
fill = "Significant",
color = "Significant"
) +
guides(color = guide_legend(override.aes = list(size = 1))) +
theme(legend.key.size = unit(0.5, 'cm'), legend.position = "top") +
coord_flip()
p2 = ggplot(df, aes(x = log2FoldChange, fill = sigDir, color = sigDir)) +
geom_histogram(bins = 50, fill = "#999999", color = "#4d4d4d") +
theme_pub() +
theme(aspect.ratio = 1) +
geom_vline(xintercept = 0, linetype = "dashed") +
labs(
x = "Fold-change[log2]",
y = "Count"
)
p1 / p2 + patchwork::plot_layout(heights = c(0.1, 0.9))
Figure 5: Basic plots of peaks that change between conditions
dfPlot = resChange %>%
mutate(sig = ifelse(sig == TRUE & log2FoldChange > 0, "Up", ifelse(sig == TRUE & log2FoldChange < 0, "Down", "Not"))) %>%
mutate(sig = factor(sig, levels = c("Not", "Up", "Down"))) %>%
arrange(desc(sig))
p1 = ggplot(dfPlot, aes(x = log2(baseMean), y = log2FoldChange, color = sig, fill = sig)) +
geom_point(shape = 21) +
scale_fill_manual(values = c("#999999", "#874C62", "#68b1a5")) +
scale_color_manual(values = c("#4d4d4d", "#623747", "#2b544d")) +
theme_pub() +
theme(legend.position = "top") +
labs(
y = "Fold-change [log2]",
x = "BaseMean [log2]",
fill = "Significant",
color = "Significant"
) +
guides(color = guide_legend(override.aes = list(size = 4))) +
theme(legend.key.size = unit(1, 'cm'), legend.position = "top") +
theme(legend.position = "top") +
ylim(-8,8) +
theme(aspect.ratio = 1)
p2 = ggplot(dfPlot, aes(x = log2FoldChange, y = -log10(padj), color = sig, fill = sig)) +
geom_vline(xintercept = 0, linetype = "dashed") +
geom_point(shape = 21) +
scale_fill_manual(values = c("#999999", "#874C62", "#68b1a5")) +
scale_color_manual(values = c("#4d4d4d", "#623747", "#2b544d")) +
theme_pub() +
theme(legend.position = "top") +
labs(
x = "Fold-change [log2]",
y = "Adjusted P value [-log10]",
fill = "Significant",
color = "Significant"
) +
guides(color = guide_legend(override.aes = list(size = 4))) +
theme(legend.key.size = unit(1, 'cm'), legend.position = "top") +
theme(legend.position = "top") +
xlim(-8,8) +
theme(aspect.ratio = 1)
p1 + p2
Figure 6: MA and volcanoe plots of constructs that do not change more than 2-fold globaly
dfPlot = resChange %>%
mutate(sig = ifelse(sig == TRUE, "Yes", "No")) %>%
mutate(sig = factor(sig, levels = c("No", "Yes"))) %>%
arrange(desc(sig))
p1 = ggplot(dfPlot, aes(x = log2FoldChange, y = -log10(padj), color = sig)) +
geom_vline(xintercept = 0, linetype = "dashed") +
ggrastr::rasterise(geom_point(size = 1), dpi = 300) +
scale_color_manual(values = c("#808080", pal_npg()(10)[3])) +
theme_pub() +
theme(legend.position = "top") +
xlim(-8,8) +
theme(aspect.ratio = 1) +
labs(
x = "Fold-change [log2]",
y = "Adjusted P value [-log10]",
title = "Version 01"
) +
guides(color = guide_legend(override.aes = list(size = 4))) +
theme(legend.key.size = unit(1, 'cm'), legend.position = "top")
dfPlot = resChange %>%
mutate(sig = ifelse(sig == TRUE & log2FoldChange > 0, "Up", ifelse(sig == TRUE & log2FoldChange < 0, "Down", "Not"))) %>%
mutate(sig = factor(sig, levels = c("Not", "Up", "Down"))) %>%
arrange(desc(sig))
p2 = ggplot(dfPlot, aes(x = log2FoldChange, y = -log10(padj), color = sig)) +
geom_vline(xintercept = 0, linetype = "dashed") +
ggrastr::rasterise(geom_point(size = 1), dpi = 300) +
scale_color_manual(values = c("#999999", "#874C62", "#68b1a5")) +
theme_pub() +
theme(legend.position = "top") +
xlim(-8,8) +
theme(aspect.ratio = 1) +
labs(
x = "Fold-change [log2]",
y = "Adjusted P value [-log10]",
title = "Version 02"
) +
guides(color = guide_legend(override.aes = list(size = 4))) +
theme(legend.key.size = unit(1, 'cm'), legend.position = "top")
p3 = ggplot(dfPlot, aes(x = log2FoldChange, y = -log10(padj), color = sig, fill = sig)) +
geom_vline(xintercept = 0, linetype = "dashed") +
ggrastr::rasterise(geom_point(size = 1, shape = 21), dpi = 300) +
scale_fill_manual(values = c("#999999", "#874C62", "#68b1a5")) +
scale_color_manual(values = c("#4d4d4d", "#623747", "#2b544d")) +
theme_pub() +
theme(legend.position = "top") +
xlim(-8,8) +
theme(aspect.ratio = 1) +
labs(
x = "Fold-change [log2]",
y = "Adjusted P value [-log10]",
title = "Version 03"
) +
guides(color = guide_legend(override.aes = list(size = 4))) +
theme(legend.key.size = unit(1, 'cm'), legend.position = "top")
p1 + p2 + p3
Figure 7: Volcano selection for figures
In the following every of the 12,000 constructs gets assigned to exactly one classification term, which are further grouped into three main categories. A construct gets assigned to term Not analyzed if it was not expressed high enough, had no peak called or if we observed a significant expression level change. All other constructs are classified as G4 containing or Not G4 containing based on the presence of a RT-Stop peak that significantly changes up or down between KCl and NaCl.
# 1) For all constructs
# -> Classify expression status
df1 = data.frame(constructID = const_notExpressed, status = "Not expressed")
df2 = data.frame(constructID = const_onlyReadThroughSignal, status = "No peak")
df3 = data.frame(constructID = const_globalLarge, status = "Global change")
# 2) For all "Expressed + small change" constructs
# -> Classify significance of constructs based on peaks
idx = match(names(peaks_globalSmall), rownames(resChange))
mcols(peaks_globalSmall) = cbind.data.frame(mcols(peaks_globalSmall), resChange[idx,])
peaks_globalSmall_sig = peaks_globalSmall[peaks_globalSmall$sig == TRUE]
const_globalSmall_sig = as.character(unique(seqnames(peaks_globalSmall_sig)))
const_globalSmall_sigNot = const_globalSmall[!const_globalSmall %in% const_globalSmall_sig]
df4 = data.frame(constructID = const_globalSmall_sigNot, status = "Not significant")
# 3) For all "Expressed + small change" and "Significant"
# -> Classify direction
resChange$dir = ifelse(resChange$log2FoldChange > 0, "Up", "Down")
constTable = as.data.frame(table(resChange[resChange$sig == TRUE,]$constructID, resChange[resChange$sig == TRUE,]$dir)) %>%
pivot_wider(names_from = c(Var2), values_from = Freq) %>% as.data.frame()
sig_up = constTable$Var1[constTable$Down == 0 & constTable$Up >= 1]
sig_down = constTable$Var1[constTable$Down >= 1 & constTable$Up == 0]
sig_mixed = constTable$Var1[!constTable$Var1 %in% sig_up]
sig_mixed = sig_mixed[!sig_mixed %in% sig_down]
df5 = data.frame(constructID = sig_up, status = "Significant-Up")
df6 = data.frame(constructID = sig_down, status = "Significant-Down")
df7 = data.frame(constructID = sig_mixed, status = "Significant-Mixed")
dfAssignment = rbind(df1,df2,df3,df4,df5,df6,df7)
dfAssignment$statusCombined = ifelse(dfAssignment$status == "Global change" | dfAssignment$status == "No peak" | dfAssignment$status == "Not expressed", "Not analysed",
ifelse(dfAssignment$status == "Not significant" | dfAssignment$status == "Significant-Down", "No G4","G4"))
df = dfAssignment %>% group_by(status, statusCombined) %>% tally()
ggplot(df, aes(x = statusCombined, y = n, fill = status)) +
geom_col(position = "dodge") +
scale_fill_d3() +
theme_pub() +
labs(
x = "#Classification group",
y = "#N",
fill = "Classification term"
) +
geom_text(aes(label = n), vjust=-0.4, size = 3, position = position_dodge(width = .9)) +
guides(color = guide_legend(override.aes = list(size = 1))) +
theme(legend.key.size = unit(1, 'cm'), legend.position = "top")
Figure 8: Construct classification in terms and groups
Here I update the constructInfo dataframe, which is used in the shiniy app.
### 1) append construct info list by G4 status
idx = match(constructInfo$Construct.ID, dfAssignment$constructID)
constructInfo2 = cbind.data.frame(constructInfo, dfAssignment[idx,])
### 2) add information to peaks granges
idx = match(names(peaks_expressed), rownames(resSimple))
mcols(peaks_expressed) = cbind(mcols(peaks_expressed), resSimple[idx,])
idx = match(peaks_expressed$constructID, dfAssignment$constructID)
mcols(peaks_expressed) = cbind(mcols(peaks_expressed), dfAssignment[idx,])
peaks_expressed = peaks_expressed[peaks_expressed$constructID %in% const_globalSmall]
idx = match(peaks_expressed$constructID, constructInfo2$Construct.ID)
mcols(peaks_expressed)$RegionID = constructInfo2$Region.ID[idx]
### 3) append construct info list with number of peak info per category
# add number of peaks per construct to info table - all peaks
df1 = peaks_expressed %>%
mcols() %>%
as.data.frame() %>%
select(statusCombined, constructID) %>%
group_by(constructID) %>%
summarise(nPeaks = n())
idx = match(constructInfo2$constructID, df1$constructID)
constructInfo2$nPeaks = df1$nPeaks[idx]
# add number of peaks per construct to info table - significant peaks
df2 = peaks_expressed %>%
mcols() %>%
as.data.frame() %>%
filter(sig == TRUE) %>%
select(statusCombined, constructID) %>%
group_by(constructID) %>%
summarise(nPeaks = n())
idx = match(constructInfo2$constructID, df2$constructID)
constructInfo2$nPeaksSig = df2$nPeaks[idx]
# add number of peaks per construct to info table - significant peaks up
df3 = peaks_expressed %>%
mcols() %>%
as.data.frame() %>%
filter(sig == TRUE & log2FoldChange > 0) %>%
select(statusCombined, constructID) %>%
group_by(constructID) %>%
summarise(nPeaks = n())
idx = match(constructInfo2$constructID, df3$constructID)
constructInfo2$nPeaksSigUp = df3$nPeaks[idx]
### 4) save and export
saveRDS(constructInfo2, file = "./data/constructInfo2.rds")
saveRDS(peaks_expressed, file = "./data/peaks_expressed.rds")
head(constructInfo2)
## Construct.ID Region.ID LSV.ID
## 1 00001 regulated.797 ENSG00000156976.15:s:186787776-186788256
## 2 00002 regulated.845 ENSG00000116350.16:t:29160375-29160563
## 3 00003 non-regulated.1087 ENSG00000178927.17:t:82446623-82446696
## 4 00004 non-regulated.720 ENSG00000126214.21:s:103687081-103687211
## 5 00005 regulated.1320 ENSG00000137501.17:s:85707429-85707531
## 6 00006 regulated.176 ENSG00000169592.14:s:30000929-30001040
## Gene.ID Gene.Name regulation Hill.cat region type original.width
## 1 ENSG00000156976 EIF4A2 regulated 4 DE5 BS 5
## 2 ENSG00000116350 SRSF4 regulated 1 UI5 pG4+BS 49
## 3 ENSG00000178927 CYBC1 non-regulated <NA> UI3 BS 5
## 4 ENSG00000126214 KLC1 non-regulated <NA> DI3 BS 5
## 5 ENSG00000137501 SYTL2 regulated 1 DI5 BS 5
## 6 ENSG00000169592 INO80E regulated 4 UI5 BS 5
## strand coords barcode
## 1 + chr3:186789102-186789247 TGCCTACAGACGGTC
## 2 - chr1:29181393-29181538 ATTACGAGGCTGGCA
## 3 - chr17:82447563-82447708 AGTCTTTTATCAGCT
## 4 + chr14:103700609-103700754 AAGCTGGACTCATCC
## 5 - chr11:85704360-85704505 TGTGGACCTGCATGT
## 6 + chr16:30001055-30001200 TGGAAACGTAGGATG
## Seq
## 1 TAATACGACTCACTATAGTTCTGATTCTTTTTCTTACACAGAATTGGCAGAGGGGGTCGATTTGGGAGGAAAGGTGTGGCTATAAACTTTGTTACTGAAGAAGACAAGAGGATTCTTCGTGACATTGAGACTTTCTACAATACTACAGTGGAGGAGATGCCCATTGCCTACAGACGGTCAGATCGGAAGAGCGGTTCAGC
## 2 TAATACGACTCACTATAGGCCTGGCCAGGCTCCGCCATAGTGGCGGCGCCTCGGGGGCGCGGGTAGGCCCCGGTTCCCACTGCGCCTGCGCGGGTGGGGGTGGTGGAGGCATAATGGGAGCTGTGCTGAGGCCCTGAGTGGGACGGGGAGAAGGAAAGCCCGAGATTACGAGGCTGGCAAGATCGGAAGAGCGGTTCAGC
## 3 TAATACGACTCACTATAGTCAGGGGCTGTGGCTCCTGGTCCCGCTGCACCAGGGAGGTGGGGTCTTACCGGGATCAATAGGCAGGCAGTCACTTTCTCTTCATAGGAATCTTGTCGATTGGCCTGGCTGCTGCCTACTACAGCGGAGGTAAAGCACCCCCCGCCAGTCTTTTATCAGCTAGATCGGAAGAGCGGTTCAGC
## 4 TAATACGACTCACTATAGGCCGCCTGCACGCCCTGAGTGAAACTCGTCTGTGCTTCATCTCCAGGGCGTCTCTGGCCGAGCCTCTTTTTGTGGAAAACGACAGCAGCAGCAGTGGCCTGGAAGACGCCACCGCTAACGTGAGTCCCACGGCCTGCAGCCCCAGGAAGCTGGACTCATCCAGATCGGAAGAGCGGTTCAGC
## 5 TAATACGACTCACTATAGAAAACTAGTTCCTGAAATCTTTTATATCCTCATGGGTTTTATGTCTGTGTGTACTGTGAATTACTAGGAGAGTTACAATTGTGAATTTGTCTTTTTCTCCTTTTAGTTTTATTATTTTTGATTCACTATATTTGAGATTGTCTTATTGTGGACCTGCATGTAGATCGGAAGAGCGGTTCAGC
## 6 TAATACGACTCACTATAGGGGATGGGAAGTGCTTGGGAGTAAGGTGGGCGTCATTTGGGCAGAAGGGCTGGGCCTCGGGGCAAGGCCAGCCCAACTTTGGGGACATCTGTCTGGGTGTGGCCCAAGTGTCCCGTGGAGGGTGTGAGTCGGGGCTGCCCCCTTCTTGGAAACGTAGGATGAGATCGGAAGAGCGGTTCAGC
## constructID status statusCombined nPeaks nPeaksSig nPeaksSigUp
## 1 00001 No peak Not analysed NA NA NA
## 2 00002 Global change Not analysed NA NA NA
## 3 00003 Significant-Up G4 8 1 1
## 4 00004 No peak Not analysed NA NA NA
## 5 00005 Not expressed Not analysed NA NA NA
## 6 00006 Significant-Mixed G4 8 8 7
Create an overview table with all numbers from the final assignment.
library(janitor)
df = constructInfo2 %>%
group_by(statusCombined) %>%
summarise(nConst = n(),
nRegion = length(unique(Region.ID)),
nPeaks = sum(nPeaks, na.rm = T),
nPeaksSig = sum(nPeaksSig, na.rm = T),
nPeaksSigUp = sum(nPeaksSigUp, na.rm = T)) %>%
adorn_totals() %>%
tibble::column_to_rownames("statusCombined")
df = df[c(3,2,1,4),]
colnames(df) = c("#Constructs", "#Rregions", "#Peaks", "#Peaks-sig", "#Peaks-sig-up")
df = myNumberFormat(df)
kable(df, "html", caption = "Construct, region and peak classifcation results.") %>%
kableExtra::kable_styling("striped") %>%
kableExtra::scroll_box(height="300px", width = "94%")
| #Constructs | #Rregions | #Peaks | #Peaks-sig | #Peaks-sig-up | |
|---|---|---|---|---|---|
| Not analysed | 4,427 | 1,919 | 0 | 0 | 0 |
| No G4 | 3,997 | 1,473 | 16,484 | 1,524 | 0 |
| G4 | 3,576 | 1,306 | 20,140 | 10,693 | 8,986 |
| Total | 12,000 | 4,698 | 36,624 | 12,217 | 8,986 |
# kable(myNumberFormat(df), caption = "Construct, region and peak classifcation results.")
Since each of the 3.000 genomic regions is represented 4 times by the constructs we need to merge constructs back to their original region. The region consensus heatmap and barcharts below shows how well the 4 construct replicates of each region agree in their classification.
m = constructInfo2 %>%
group_by(Region.ID, statusCombined) %>%
summarise(n = (n())) %>%
pivot_wider(names_from = statusCombined, values_from = n) %>%
replace(is.na(.), 0) %>%
as.data.frame() %>%
select(-c(Region.ID)) %>%
relocate("G4", "No G4", "Not analysed")
m = m[order(m$`Not analysed`, decreasing = TRUE),]
m = m[order(m$`No G4`, decreasing = TRUE),]
m = m[order(m$`G4`, decreasing = TRUE),]
custom.col = structure(c("white", pal_material(palette = "blue-grey")(10)[3],
pal_material(palette = "blue-grey")(10)[5], pal_material(palette = "blue-grey")(10)[7],
pal_material(palette = "blue-grey")(10)[9]
), names = c("0", "1", "2", "3", "4")) # black, red, green, blue
h = Heatmap(m, column_title = "Region consensus", name = "#construct support per region",
cluster_columns = FALSE, cluster_rows = FALSE,
show_row_names = FALSE, show_column_names = TRUE,
show_row_dend = FALSE,
border = TRUE, col = custom.col,
heatmap_legend_param = list(direction = "horizontal", nrow = 1),
use_raster = FALSE
)
draw(h, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
Figure 9: Region and construct agreement based on classification type
Rows = regions; cols = classification; n = number of constructs.
colFill = c("#cccccc", "#BB6464", "#7897AB")
colCol = c("#a6a6a6", "#a04646", "#58788d")
df = m %>%
as.data.frame() %>%
mutate(class = ifelse(G4 %in% c(4,3,2), "G4", ifelse(`No G4` %in% c(4,3,2) & !`G4` %in% c(4,3,2), "No G4", "Not Analyzed"))) %>%
mutate(across(where(is.numeric), function(x){ifelse(x > 1, 1, x)})) %>%
pivot_longer(-class) %>% group_by(class, name, value) %>% tally() %>% filter(value == 1) %>% ungroup() %>% group_by(class) %>%
# filter(!(class == "G4" & name == "No G4")) %>%
# filter(!(class == "No G4" & name == "G4")) %>%
mutate(name = factor(name, levels = (c("Not analysed", "No G4", "G4"))),
class = factor(class, level = rev(c("G4", "No G4", "Not Analyzed"))))
ggplot(df, aes(x = class, y = n, fill = name, color = name)) +
geom_col(position = "dodge", size = 0.8) +
scale_fill_manual(values = colFill) +
scale_color_manual(values = colCol) +
theme_pub() +
guides(color = guide_legend(override.aes = list(size = 0.8))) +
theme(legend.key.size = unit(1, 'cm'), legend.position = "top") +
labs(
x = "Region class",
y = "#Constructs",
fill = "Construct class",
color = "Construct class"
) +
coord_flip()
Figure 10: Number of constructs classified in other regions compared to those classifed as in the same region
For all 3,576 constructs where we observed a G4 (based on up-regulated G4 peaks) we now choose a single construct to represent the entire region from which it originated. This is done by always selecting the construct with the most counts if at least two constructs of this region were classified as up-regulated. The same definition is used to select No G4 containing constructs for the respective region, with the additional constraint that no construct was classified as G4 containing beforehand.
### ============================================================================
### Choose representative construct for region -> G4 Regions
### ----------------------------------------------------------------------------
### Rule: Take the construct with the highest counts; at least 2/4 constructs
### must be classified as G4
###
# Count crosslink sum per construct and match construct counts with region identifier
constCounts = sapply(constructSignal, sum) %>%
as.data.frame() %>%
dplyr::select(1:6)
G4Const = constCounts[rownames(constCounts) %in% dfAssignment$constructID[dfAssignment$statusCombined == "G4"],]
G4Const$Region.ID = constructInfo2$Region.ID[match(rownames(G4Const), constructInfo2$Construct.ID)]
### 1) keep only those regions that have at least two constructs classified as G4 containing
### -> present in the set G4Const
# diagnostic
df1 = G4Const %>%
group_by(Region.ID) %>%
summarise(n = n()) %>%
group_by(n) %>%
tally(name = "nn") %>%
mutate(n = as.factor(n)) %>%
mutate(type = "G4 Regions")
# select for G4 regions with at least 2 G4 constructs
keep = G4Const %>%
group_by(Region.ID) %>%
summarise(n = n()) %>%
filter(n >= 2) %>%
pull(Region.ID)
G4Const = G4Const[G4Const$Region.ID %in% keep,]
# select the construct with the most counts as representative for the G4 region
keep = G4Const %>%
mutate(sCounts = rowSums(across(contains("rep")))) %>%
select(-contains("rep")) %>%
tibble::rownames_to_column("constructID") %>%
group_by(Region.ID) %>%
filter(sCounts == max(sCounts)) %>%
pull(constructID)
G4Const = G4Const[rownames(G4Const) %in% keep,]
### ============================================================================
### Choose representative construct for region -> No G4 Regions
### ----------------------------------------------------------------------------
### Rule: Take the construct with the highest counts; at least 2/4 constructs
### must be classified as G4; the region can not be already labeled as G4
###
noG4Const = constCounts[rownames(constCounts) %in% dfAssignment$constructID[dfAssignment$statusCombined == "No G4"],]
noG4Const$Region.ID = constructInfo2$Region.ID[match(rownames(noG4Const), constructInfo2$Construct.ID)]
df2 = noG4Const %>%
filter(!Region.ID %in% G4Const$Region.ID) %>%
group_by(Region.ID) %>%
summarise(n = n()) %>%
group_by(n) %>%
tally(name = "nn") %>%
mutate(n = as.factor(n)) %>%
mutate(type = "No G4 Regions")
# select for no G4 regions with at least to no G4 constructs
# -> non of these are allowed to be classified as G4
keep = noG4Const %>%
filter(!Region.ID %in% G4Const$Region.ID) %>%
group_by(Region.ID) %>%
summarise(n = n()) %>%
filter(n >= 2) %>%
pull(Region.ID)
noG4Const = noG4Const[noG4Const$Region.ID %in% keep,]
# select the construct with the most counts as representative for the G4 region
keep = noG4Const %>%
mutate(sCounts = rowSums(across(contains("rep")))) %>%
select(-contains("rep")) %>%
tibble::rownames_to_column("constructID") %>%
group_by(Region.ID) %>%
dplyr::slice(which.max(sCounts)) %>%
pull(constructID)
noG4Const = noG4Const[rownames(noG4Const) %in% keep,]
### ============================================================================
### Make plot
### ----------------------------------------------------------------------------
###
df = rbind.data.frame(df1,df2)
dfNg4 = df %>%filter(n != 1 & type == "G4 Regions") %>% summarise(s = sum(nn))
dfNnog4 = df %>%filter(n != 1 & type == "No G4 Regions") %>% summarise(s = sum(nn))
colFill = c("#7897AB", "#BB6464")
colCol = c("#58788d", "#a04646")
ggplot(df, aes(x = n, y = nn, fill = type, color = type)) +
geom_col(position = "dodge", size = 1) +
theme_pub() +
labs(
x = "#Constructs",
y = "#Regions",
fill = "Region type",
color = "Region type"
) +
geom_text(aes(label = nn), vjust=-0.4, size = 3, position = position_dodge(width = .9)) +
scale_fill_manual(values = colFill) +
scale_color_manual(values = colCol) +
guides(color = guide_legend(override.aes = list(size = 1))) +
theme(legend.key.size = unit(1, 'cm'), legend.position = "top") +
annotate("text", x = 1, y = 450, hjust = 0.3, size = 3,
label = paste0("#G4 Regions: ", myNumberFormat(dfNg4$s),
"\n#No G4 Regions: ", myNumberFormat(dfNnog4$s)))
Figure 11: Region classification by construct agreement
The above definition results in 1,060 G4 and 1,109 No G4 merged construct regions.
Here we assign a representative RT-Stop peak to each construct, since multiple peaks may exist. As representative always the strongest upregulated RT-Stop peak is chosen. Peak strength is determined by the ratio of RT-Stops within a given peak over the total RT-Stop of the construct. The total strength of a construct is then the sum of all peak strength divided by the number of peaks.
# Calculate peak and construct strength
constCountsNew = constCounts %>% tibble::rownames_to_column("ConstructID")
AllG4Peaks = cov[seqnames(cov) %in% rownames(G4Const)]
mcols(AllG4Peaks) = mcols(AllG4Peaks)[1:3]
AllG4Peaks = AllG4Peaks[names(AllG4Peaks) %in% rownames(resChange[resChange$dir == "Up",])]
dfPre = as.data.frame(AllG4Peaks) %>%
dplyr::mutate(xlinksKCL = X1_GTP_KCL + X2_GTP_KCL + X3_GTP_KCL) %>%
tibble::rownames_to_column("PeakID") %>%
select(PeakID, seqnames, xlinksKCL) %>%
rename("ConstructID" = "seqnames") %>%
group_by(ConstructID) %>%
left_join( y = constCountsNew, by = "ConstructID") %>%
mutate(xlinksKCL_total = GTP_KCL_rep1 + GTP_KCL_rep2 + GTP_KCL_rep3) %>%
mutate(xlinksKCL_ratio = xlinksKCL / xlinksKCL_total) %>%
select(c(PeakID, ConstructID, starts_with("xlinks"))) %>%
arrange(desc(xlinksKCL_ratio), .by_group = TRUE) %>%
mutate(sel = as.factor(seq_along(PeakID)) ) %>%
summarise(PeakID, ConstructID, xlinksKCL, xlinksKCL_total, xlinksKCL_ratio, sel, n = n(), sum_KCL_ratio = sum(xlinksKCL_ratio)) %>%
mutate(sum_KCL_ratio_norm = sum_KCL_ratio/ n)
df1 = data.frame(selection = "Best peak ratio", value = dfPre %>% filter(sel == 1) %>% pull(xlinksKCL_ratio))
df2 = data.frame(selection = "Total construct ratio", value = dfPre %>% filter(sel == 1) %>% pull(sum_KCL_ratio_norm))
df = rbind(df1,df2)
ggplot(df, aes(x = value, fill = selection)) +
geom_density() +
facet_wrap(~selection) +
theme_pub() +
theme(legend.position = "none") +
labs(
x = "Indicated ratio",
y = "Density"
) +
scale_fill_simpsons()
Figure 12: Distribution of peak and construct strength ratios
From the total of 3,775 peaks we observe 1,060 peaks on the selected representative constructs, with a mean, median and maximum of 4.7, 4 and 4 peaks per construct, respectively.
dfPlot = dfPre %>%
select(ConstructID, n, sum_KCL_ratio, sum_KCL_ratio_norm) %>%
unique() %>%
ungroup() %>%
mutate(n = ifelse(n >= 10, ">=10", n)) %>%
mutate(n = factor(n, levels = c(as.character(1:9), ">=10")))
colBright = c(sapply(1:10, function(x){pal_material(palette = "blue-grey", reverse = TRUE)(10)[x]}))
colDark = c("#151b1e", "#2a363c", "#34434b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b")
p1 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio, fill = n, color = n)) +
geom_boxplot() +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = c(sapply(1:10, function(x){pal_material(palette = "blue-grey", reverse = TRUE)(10)[x]}))) +
scale_color_manual(values = colDark) +
labs(
title = "G4 potential of construct - V1",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio)]"
)
dfMean = dfPlot %>%
group_by(n) %>%
summarise(med = median(sum_KCL_ratio_norm), mean = mean(sum_KCL_ratio_norm), N = n())
p2 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
geom_boxplot() +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
labs(
title = "G4 potential of construct - V2",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
)
p3 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
geom_boxplot(outlier.shape = NA) +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
coord_cartesian(ylim = c(0,0.4)) +
labs(
title = "G4 potential of construct - V3",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
)
p4 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
geom_boxplot(outlier.shape = NA) +
geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
coord_cartesian(ylim = c(-0.01,0.4)) +
labs(
title = "G4 potential of construct - V4",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
) +
geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)
p5 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
geom_boxplot(outlier.shape = NA) +
geom_quasirandom_rast(shape = 21, stroke = 0.3, size = 0.1) +
geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
coord_cartesian(ylim = c(-0.01,0.4)) +
labs(
title = "G4 potential of construct - V5",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
) +
geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)
p6 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
geom_half_boxplot(outlier.shape = NA) +
geom_half_point(shape = 21, stroke = 0.3, size = 0.1) +
geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
coord_cartesian(ylim = c(-0.01,0.4)) +
labs(
title = "G4 potential of construct - V6",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
) +
geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)
p1 + p2 + p3 + p4 + p5 + p6
Figure 13: Construct G4 potential
The strength of each peak is given by the ratio of counts in the respective peak over the total counts per construct. The Total potential of a construct is given by the sum of its peaks (divided by the number of peaks).
The peak strength is computed as a ratio of crosslinks within a peak over the total number of crosslinks on the hosting construct. The sum of this ratio from all peaks of a given constructs denotes the G4 potential of the respective construct. This sums is then divided by the number of peaks for normalization and give the total construct ratio.
dfBestRatio = dfPre %>%
group_by(ConstructID) %>%
dplyr::slice(1) %>%
ungroup() %>%
mutate(n = ifelse(n >= 10, ">=10", n)) %>%
mutate(n = factor(n, levels = c(as.character(1:9), ">=10")))
colBright = c(sapply(1:10, function(x){pal_material(palette = "blue-grey", reverse = TRUE)(10)[x]}))
colDark = c("#151b1e", "#2a363c", "#34434b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b")
p1 = ggplot(dfBestRatio, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
geom_boxplot() +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = c(sapply(1:10, function(x){pal_material(palette = "blue-grey", reverse = TRUE)(10)[x]}))) +
scale_color_manual(values = colDark) +
labs(
title = "G4 potential of construct - V1",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio)]"
)
dfMean = dfBestRatio %>%
group_by(n) %>%
summarise(med = median(xlinksKCL_ratio), mean = mean(xlinksKCL_ratio), N = n())
p2 = ggplot(dfBestRatio, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
geom_boxplot() +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
labs(
title = "G4 potential of construct - V2",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
)
p3 = ggplot(dfBestRatio, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
geom_boxplot(outlier.shape = NA) +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
coord_cartesian(ylim = c(0,0.55)) +
labs(
title = "G4 potential of construct - V3",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
)
p4 = ggplot(dfBestRatio, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
geom_boxplot(outlier.shape = NA) +
geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
coord_cartesian(ylim = c(-0.01,0.55)) +
labs(
title = "G4 potential of construct - V4",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
) +
geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)
p5 = ggplot(dfBestRatio, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
geom_boxplot(outlier.shape = NA) +
geom_quasirandom_rast(shape = 21, stroke = 0.3, size = 0.1) +
geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
coord_cartesian(ylim = c(-0.01,0.55)) +
labs(
title = "G4 potential of construct - V5",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
) +
geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)
p6 = ggplot(dfBestRatio, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
geom_half_boxplot(outlier.shape = NA) +
geom_half_point(shape = 21, stroke = 0.3, size = 0.1) +
geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
coord_cartesian(ylim = c(-0.01,0.55)) +
labs(
title = "G4 potential of construct - V6",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
) +
geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)
p1 + p2 + p3 + p4 + p5 + p6
Figure 14: Construct G4 potential - based on best peak
The strength of each peak is given by the ratio of counts in the respective peak over the total counts per construct. The Total potential of a construct is given by the sum of its peaks (divided by the number of peaks).
Finally I export all information for the G4 containing RT-Stop peaks as .xlsx and .rds. The G4Peaks file contains the representative G4 peak for each construct classified as G4. The AllG4Peaks file contains the relative position of all upregulated G4 peaks for each construct classified as G4.
# Match results to all G4 peaks
idx = match(names(AllG4Peaks), dfPre$PeakID)
AllG4Peaks$xlinksKCL_ratio = dfPre$xlinksKCL_ratio[idx]
AllG4Peaks$peaksPosition = sapply(strsplit(names(AllG4Peaks),"_"), `[`, 2)
AllG4Peaks$sum_KCL_ratio_norm = dfPre$sum_KCL_ratio_norm[idx]
# Match results to best G4 peak
# -> Select the strongest peak per construct as representative position
keep = dfBestRatio %>% pull(PeakID)
G4Peaks = AllG4Peaks[names(AllG4Peaks) %in% keep,]
idx = match(names(G4Peaks), dfBestRatio$PeakID)
G4Peaks$xlinksKCL_ratio = dfBestRatio$xlinksKCL_ratio[idx]
G4Peaks$peaksPosition = sapply(strsplit(names(G4Peaks),"_"), `[`, 2)
G4Peaks$sum_KCL_ratio_norm = dfBestRatio$sum_KCL_ratio_norm[idx]
# Save G4 peak info
resChange$PeakID = rownames(resChange)
df = G4Peaks %>%
as.data.frame() %>%
tibble::rownames_to_column("PeakID") %>%
rename("ConstructID" = "seqnames") %>%
mutate(RegionID = constructInfo2$Region.ID[match(ConstructID, constructInfo2$Construct.ID)]) %>%
left_join(y = resChange, by = c("PeakID")) %>%
select(-c(dir, constructID, sig, peaksPosition)) %>%
relocate(RegionID, ConstructID, PeakID)
G4Peaks$ConstructID = seqnames(G4Peaks)
G4Peaks$RegionID = df$RegionID[match(df$ConstructID, G4Peaks$ConstructID)]
write.xlsx(df, file = "./data/g4peaklist.xlsx", sheetName = "G4Peaks")
saveRDS(G4Peaks, file = "./data/G4Peaks.rds")
saveRDS(AllG4Peaks, file = "./data/AllG4Peaks.rds")
Just like for the G4 containing constructs we assign a representative peak for each of the No G4 containing constructs, computing the same ratio per peak and construct.
# Calculate peak and construct strength
AllNoG4Peaks = cov[seqnames(cov) %in% rownames(noG4Const)]
mcols(AllNoG4Peaks) = mcols(AllNoG4Peaks)[1:3]
dfNoPre = as.data.frame(AllNoG4Peaks) %>%
mutate(xlinksKCL = X1_GTP_KCL + X2_GTP_KCL + X3_GTP_KCL) %>%
tibble::rownames_to_column("PeakID") %>%
select(PeakID, seqnames, xlinksKCL) %>%
rename("ConstructID" = "seqnames") %>%
group_by(ConstructID) %>%
left_join( y = constCountsNew, by = "ConstructID") %>%
mutate(xlinksKCL_total = GTP_KCL_rep1 + GTP_KCL_rep2 + GTP_KCL_rep3) %>%
mutate(xlinksKCL_ratio = xlinksKCL / xlinksKCL_total) %>%
select(c(PeakID, ConstructID, starts_with("xlinks"))) %>%
arrange(desc(xlinksKCL_ratio), .by_group = TRUE) %>%
mutate(sel = as.factor(seq_along(PeakID)) ) %>%
summarise(PeakID, ConstructID, xlinksKCL, xlinksKCL_total, xlinksKCL_ratio, sel, n = n(), sum_KCL_ratio = sum(xlinksKCL_ratio)) %>%
mutate(sum_KCL_ratio_norm = sum_KCL_ratio/ n)
dfPlot = dfNoPre %>%
select(ConstructID, n, sum_KCL_ratio, sum_KCL_ratio_norm) %>%
unique() %>%
ungroup() %>%
mutate(n = ifelse(n >= 10, ">=10", n)) %>%
mutate(n = factor(n, levels = c(as.character(1:9), ">=10")))
colBright = c(sapply(1:10, function(x){pal_material(palette = "blue-grey", reverse = TRUE)(10)[x]}))
colDark = c("#151b1e", "#2a363c", "#34434b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b")
p1 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio, fill = n, color = n)) +
geom_boxplot() +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = c(sapply(1:10, function(x){pal_material(palette = "blue-grey", reverse = TRUE)(10)[x]}))) +
scale_color_manual(values = colDark) +
labs(
title = "G4 potential of construct - V1",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio)]"
)
dfMean = dfPlot %>%
group_by(n) %>%
summarise(med = median(sum_KCL_ratio_norm), mean = mean(sum_KCL_ratio_norm), N = n())
p2 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
geom_boxplot() +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
labs(
title = "G4 potential of construct - V2",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
)
p3 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
geom_boxplot(outlier.shape = NA) +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
coord_cartesian(ylim = c(0,0.1)) +
labs(
title = "G4 potential of construct - V3",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
)
p4 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
geom_boxplot(outlier.shape = NA) +
geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
coord_cartesian(ylim = c(-0.01,0.1)) +
labs(
title = "G4 potential of construct - V4",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
) +
geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)
p5 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
geom_boxplot(outlier.shape = NA) +
geom_quasirandom_rast(shape = 21, stroke = 0.3, size = 0.1) +
geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
coord_cartesian(ylim = c(-0.01,0.1)) +
labs(
title = "G4 potential of construct - V5",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
) +
geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)
p6 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
geom_half_boxplot(outlier.shape = NA) +
geom_half_point(shape = 21, stroke = 0.3, size = 0.1) +
geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
coord_cartesian(ylim = c(-0.01,0.1)) +
labs(
title = "G4 potential of construct - V6",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
) +
geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)
p1 + p2 + p3 + p4 + p5 + p6
Figure 15: No G4 construct potential
The strength of each peak is given by the ratio of counts in the respective peak over the total counts per construct. The Total potential of a construct is given by the sum of its peaks (divided by the number of peaks).
dfBestRatioNoG4 = dfNoPre %>%
group_by(ConstructID) %>%
dplyr::slice(1) %>%
ungroup() %>%
mutate(n = ifelse(n >= 10, ">=10", n)) %>%
mutate(n = factor(n, levels = c(as.character(1:9), ">=10")))
colBright = c(sapply(1:10, function(x){pal_material(palette = "blue-grey", reverse = TRUE)(10)[x]}))
colDark = c("#151b1e", "#2a363c", "#34434b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b")
p1 = ggplot(dfBestRatioNoG4, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
geom_boxplot() +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = c(sapply(1:10, function(x){pal_material(palette = "blue-grey", reverse = TRUE)(10)[x]}))) +
scale_color_manual(values = colDark) +
labs(
title = "G4 potential of construct - V1",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio)]"
)
dfMean = dfBestRatioNoG4 %>%
group_by(n) %>%
summarise(med = median(xlinksKCL_ratio), mean = mean(xlinksKCL_ratio), N = n())
p2 = ggplot(dfBestRatioNoG4, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
geom_boxplot() +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
labs(
title = "G4 potential of construct - V2",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
)
p3 = ggplot(dfBestRatioNoG4, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
geom_boxplot(outlier.shape = NA) +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
coord_cartesian(ylim = c(0,0.55)) +
labs(
title = "G4 potential of construct - V3",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
)
p4 = ggplot(dfBestRatioNoG4, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
geom_boxplot(outlier.shape = NA) +
geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
coord_cartesian(ylim = c(-0.01,0.55)) +
labs(
title = "G4 potential of construct - V4",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
) +
geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)
p5 = ggplot(dfBestRatioNoG4, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
geom_boxplot(outlier.shape = NA) +
geom_quasirandom_rast(shape = 21, stroke = 0.3, size = 0.1) +
geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
coord_cartesian(ylim = c(-0.01,0.55)) +
labs(
title = "G4 potential of construct - V5",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
) +
geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)
p6 = ggplot(dfBestRatioNoG4, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
geom_half_boxplot(outlier.shape = NA) +
geom_half_point(shape = 21, stroke = 0.3, size = 0.1) +
geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
theme_pub() +
theme( legend.position = "none") +
scale_fill_manual(values = colBright) +
scale_color_manual(values = colDark) +
coord_cartesian(ylim = c(-0.01,0.55)) +
labs(
title = "G4 potential of construct - V6",
x = "#Peaks in construct",
y = "Construct strength [sum(peak strenth ratio) / #peaks]"
) +
geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)
p1 + p2 + p3 + p4 + p5 + p6
Figure 16: No G4 construct potential - based on best peak
The strength of each peak is given by the ratio of counts in the respective peak over the total counts per construct. The Total potential of a construct is given by the sum of its peaks (divided by the number of peaks).
Finally I export all information for the No G4 containing RT-Stop peaks as .xlsx and .rds. The NoG4Peaks file contains the representative No G4 peak for each construct classified as No G4. The AllNoG4Peaks file contains the relative position of all upregulated No G4 peaks for each construct classified as No G4.
# Match results to all no G4 peaks
idx = match(names(AllNoG4Peaks), dfNoPre$PeakID)
AllNoG4Peaks$xlinksKCL_ratio = dfNoPre$xlinksKCL_ratio[idx]
AllNoG4Peaks$peaksPosition = sapply(strsplit(names(AllNoG4Peaks),"_"), `[`, 2)
AllNoG4Peaks$sum_KCL_ratio_norm = dfNoPre$sum_KCL_ratio_norm[idx]
# Match results to best no G4 peak
# -> Select the strongest peak per construct as representative position
keepNoG4 = dfBestRatioNoG4 %>% pull(PeakID)
NoG4Peaks = AllNoG4Peaks[names(AllNoG4Peaks) %in% keepNoG4,]
idx = match(names(NoG4Peaks), dfBestRatioNoG4$PeakID)
NoG4Peaks$xlinksKCL_ratio = dfBestRatioNoG4$xlinksKCL_ratio[idx]
NoG4Peaks$peaksPosition = sapply(strsplit(names(NoG4Peaks),"_"), `[`, 2)
NoG4Peaks$sum_KCL_ratio_norm = dfBestRatioNoG4$sum_KCL_ratio_norm[idx]
df = NoG4Peaks %>%
as.data.frame() %>%
tibble::rownames_to_column("PeakID") %>%
rename("ConstructID" = "seqnames") %>%
mutate(RegionID = constructInfo2$Region.ID[match(ConstructID, constructInfo2$Construct.ID)]) %>%
left_join(y = resChange, by = c("PeakID")) %>%
select(-c(dir, constructID, sig, peaksPosition)) %>%
relocate(RegionID, ConstructID, PeakID)
NoG4Peaks$ConstructID = seqnames(NoG4Peaks)
NoG4Peaks$RegionID = df$RegionID[match(df$ConstructID, NoG4Peaks$ConstructID)]
write.xlsx(df, file = "./data/g4peaklist.xlsx", sheetName = "NoG4Peaks", append = TRUE)
saveRDS(NoG4Peaks, file = "./data/NoG4Peaks.rds")
saveRDS(AllNoG4Peaks, file = "./data/AllNoG4Peaks.rds")
Here we explore the G4 sequence composition for the main RT-Stop peaks of constructs with and without G4. The following plots show a combination of logos computed from all peaks or the best peak on a represetnative G4 (and No G4) construct.
getPeakSeq <- function(peak, adjacentNucs, trimRight) {
r1 = (peak) %>% as.data.frame()
seq <- constructInfo2$Seq[match(r1$seqnames, constructInfo2$Construct.ID)]
peakSeq <- substr(seq, start=r1$start-adjacentNucs, stop=r1$start+adjacentNucs-trimRight)
return(peakSeq)
}
df1 = getPeakSeq(AllG4Peaks, adjacentNucs = 10, trimRight = 9)
df2 = getPeakSeq(AllNoG4Peaks, adjacentNucs = 10, trimRight = 9)
p1 = ggseqlogo(df1) +
geom_hline(yintercept = 0) +
ylim(0,2) +
scale_x_continuous(breaks=c(1:12), labels=c(-11:0)) +
labs(title = "G4 peaks")
p2 = ggseqlogo(df2) +
geom_hline(yintercept = 0) +
ylim(0,2) +
scale_x_continuous(breaks=c(1:12), labels=c(-11:0)) +
labs(title = "No G4 peaks")
p1 + p2
Figure 17: Sequence logo plots
All peaks on G4 and no G4 constructs.
df1 = getPeakSeq(G4Peaks, adjacentNucs = 10, trimRight = 9)
df2 = getPeakSeq(NoG4Peaks, adjacentNucs = 10, trimRight = 9)
p1 = ggseqlogo(df1) +
geom_hline(yintercept = 0) +
ylim(0,2) +
scale_x_continuous(breaks=c(1:12), labels=c(-11:0)) +
labs(title = "G4 peaks")
p2 = ggseqlogo(df2) +
geom_hline(yintercept = 0) +
ylim(0,2) +
scale_x_continuous(breaks=c(1:12), labels=c(-11:0)) +
labs(title = "No G4 peaks")
p1 + p2
Figure 18: Sequence logo plots
Best peaks of G4 and no G4 constructs.
Peaks are grouped into 20% stepwise bins, where grouping is either done based on the best peak ratio or the total construct ratio. This is again done for all or the best G4 and No G4 peaks.
# Best peak per construct
# -> Split by best peak ratio
q = quantile(G4Peaks$xlinksKCL_ratio, probs = seq(0, 1, by = 0.2))
s = findInterval(G4Peaks$xlinksKCL_ratio, q, all.inside = TRUE)
mcols(G4Peaks)$GTP_KCL_ratio_bin = s
splitPeaks = split(G4Peaks, G4Peaks$GTP_KCL_ratio_bin)
df = lapply(splitPeaks, function(x){
getPeakSeq(x, adjacentNucs = 17, trimRight = 16)
})
names(df) = paste0(seq(0,80, by = 20), "-", seq(20,100, by = 20), "%")
p1 = ggseqlogo(df, ncol = 5) +
geom_hline(yintercept = 0) +
geom_hline(yintercept = 2, linetype = "dashed") +
ylim(0,2) +
scale_x_continuous(breaks=c(seq(1, 20, by = 2)), labels=c(seq(-19, -1, by = 2))) +
labs(
title = "Best G4 peaks - Split by best peak ratio",
) +
theme(axis.text=element_text(size=8))
# Best peak per construct
# -> Split by total construct ratio
q = quantile(G4Peaks$sum_KCL_ratio_norm, probs = seq(0, 1, by = 0.2))
s = findInterval(G4Peaks$sum_KCL_ratio_norm, q, all.inside = TRUE)
mcols(G4Peaks)$GTP_KCL_sum_ratio_bin = s
splitPeaks = split(G4Peaks, G4Peaks$GTP_KCL_sum_ratio_bin)
df = lapply(splitPeaks, function(x){
getPeakSeq(x, adjacentNucs = 17, trimRight = 16)
})
names(df) = paste0(seq(0,80, by = 20), "-", seq(20,100, by = 20), "%")
p2 = ggseqlogo(df, ncol = 5) +
geom_hline(yintercept = 0) +
geom_hline(yintercept = 2, linetype = "dashed") +
ylim(0,2) +
scale_x_continuous(breaks=c(seq(1, 20, by = 2)), labels=c(seq(-19, -1, by = 2))) +
labs(
title = "Best G4 peaks - Split by construct ratio",
) +
theme(axis.text=element_text(size=8))
# All peaks per construct
# -> Split by best peak ratio
q = quantile(AllG4Peaks$xlinksKCL_ratio, probs = seq(0, 1, by = 0.2))
s = findInterval(AllG4Peaks$xlinksKCL_ratio, q, all.inside = TRUE)
mcols(AllG4Peaks)$GTP_KCL_ratio_bin = s
splitPeaks = split(AllG4Peaks, AllG4Peaks$GTP_KCL_ratio_bin)
df = lapply(splitPeaks, function(x){
getPeakSeq(x, adjacentNucs = 17, trimRight = 16)
})
names(df) = paste0(seq(0,80, by = 20), "-", seq(20,100, by = 20), "%")
p3 = ggseqlogo(df, ncol = 5) +
geom_hline(yintercept = 0) +
geom_hline(yintercept = 2, linetype = "dashed") +
ylim(0,2) +
scale_x_continuous(breaks=c(seq(1, 20, by = 2)), labels=c(seq(-19, -1, by = 2))) +
labs(
title = "All G4 peaks - Split by best peak ratio",
) +
theme(axis.text=element_text(size=8))
# All peaks per construct
# -> Split by total construct ratio
q = quantile(AllG4Peaks$sum_KCL_ratio_norm, probs = seq(0, 1, by = 0.2))
s = findInterval(AllG4Peaks$sum_KCL_ratio_norm, q, all.inside = TRUE)
mcols(AllG4Peaks)$GTP_KCL_sum_ratio_bin = s
splitPeaks = split(AllG4Peaks, AllG4Peaks$GTP_KCL_sum_ratio_bin)
df = lapply(splitPeaks, function(x){
getPeakSeq(x, adjacentNucs = 17, trimRight = 16)
})
names(df) = paste0(seq(0,80, by = 20), "-", seq(20,100, by = 20), "%")
p4 = ggseqlogo(df, ncol = 5) +
geom_hline(yintercept = 0) +
geom_hline(yintercept = 2, linetype = "dashed") +
ylim(0,2) +
scale_x_continuous(breaks=c(seq(1, 20, by = 2)), labels=c(seq(-19, -1, by = 2))) +
labs(
title = "All G4 peaks - Split by construct ratio",
) +
theme(axis.text=element_text(size=8))
p1 / p2 / p3 / p4
Figure 19: G4 Peaks split by KCL ratio in 20% bins
# Best peak per construct
# -> Split by best peak ratio
q = quantile(NoG4Peaks$xlinksKCL_ratio, probs = seq(0, 1, by = 0.2))
s = findInterval(NoG4Peaks$xlinksKCL_ratio, q, all.inside = TRUE)
mcols(NoG4Peaks)$GTP_KCL_ratio_bin = s
splitPeaks = split(NoG4Peaks, NoG4Peaks$GTP_KCL_ratio_bin)
df = lapply(splitPeaks, function(x){
getPeakSeq(x, adjacentNucs = 17, trimRight = 16)
})
names(df) = paste0(seq(0,80, by = 20), "-", seq(20,100, by = 20), "%")
p1 = ggseqlogo(df, ncol = 5) +
geom_hline(yintercept = 0) +
geom_hline(yintercept = 2, linetype = "dashed") +
ylim(0,2) +
scale_x_continuous(breaks=c(seq(1, 20, by = 2)), labels=c(seq(-19, -1, by = 2))) +
labs(
title = "Best no G4 peaks - Split by best peak ratio",
) +
theme(axis.text=element_text(size=8))
# Best peak per construct
# -> Split by total construct ratio
q = quantile(NoG4Peaks$sum_KCL_ratio_norm, probs = seq(0, 1, by = 0.2))
s = findInterval(NoG4Peaks$sum_KCL_ratio_norm, q, all.inside = TRUE)
mcols(NoG4Peaks)$GTP_KCL_sum_ratio_bin = s
splitPeaks = split(NoG4Peaks, NoG4Peaks$GTP_KCL_sum_ratio_bin)
df = lapply(splitPeaks, function(x){
getPeakSeq(x, adjacentNucs = 17, trimRight = 16)
})
names(df) = paste0(seq(0,80, by = 20), "-", seq(20,100, by = 20), "%")
p2 = ggseqlogo(df, ncol = 5) +
geom_hline(yintercept = 0) +
geom_hline(yintercept = 2, linetype = "dashed") +
ylim(0,2) +
scale_x_continuous(breaks=c(seq(1, 20, by = 2)), labels=c(seq(-19, -1, by = 2))) +
labs(
title = "Best no G4 peaks - Split by construct ratio",
) +
theme(axis.text=element_text(size=8))
# All peaks per construct
# -> Split by best peak ratio
q = quantile(AllNoG4Peaks$xlinksKCL_ratio, probs = seq(0, 1, by = 0.2))
s = findInterval(AllNoG4Peaks$xlinksKCL_ratio, q, all.inside = TRUE)
mcols(AllNoG4Peaks)$GTP_KCL_ratio_bin = s
splitPeaks = split(AllNoG4Peaks, AllNoG4Peaks$GTP_KCL_ratio_bin)
df = lapply(splitPeaks, function(x){
getPeakSeq(x, adjacentNucs = 17, trimRight = 16)
})
names(df) = paste0(seq(0,80, by = 20), "-", seq(20,100, by = 20), "%")
p3 = ggseqlogo(df, ncol = 5) +
geom_hline(yintercept = 0) +
geom_hline(yintercept = 2, linetype = "dashed") +
ylim(0,2) +
scale_x_continuous(breaks=c(seq(1, 20, by = 2)), labels=c(seq(-19, -1, by = 2))) +
labs(
title = "All no G4 peaks - Split by best peak ratio",
) +
theme(axis.text=element_text(size=8))
# All peaks per construct
# -> Split by total construct ratio
q = quantile(AllNoG4Peaks$sum_KCL_ratio_norm, probs = seq(0, 1, by = 0.2))
s = findInterval(AllNoG4Peaks$sum_KCL_ratio_norm, q, all.inside = TRUE)
mcols(AllNoG4Peaks)$GTP_KCL_sum_ratio_bin = s
splitPeaks = split(AllNoG4Peaks, AllNoG4Peaks$GTP_KCL_sum_ratio_bin)
df = lapply(splitPeaks, function(x){
getPeakSeq(x, adjacentNucs = 17, trimRight = 16)
})
names(df) = paste0(seq(0,80, by = 20), "-", seq(20,100, by = 20), "%")
p4 = ggseqlogo(df, ncol = 5) +
geom_hline(yintercept = 0) +
geom_hline(yintercept = 2, linetype = "dashed") +
ylim(0,2) +
scale_x_continuous(breaks=c(seq(1, 20, by = 2)), labels=c(seq(-19, -1, by = 2))) +
labs(
title = "All no G4 peaks - Split by construct ratio",
) +
theme(axis.text=element_text(size=8))
p1 / p2 / p3 / p4
Figure 20: No G4 Peaks split by KCL ratio in 20% bins
export_G4Constructs = G4Const %>%
tibble::rownames_to_column("Construct.ID") %>%
select(Construct.ID, Region.ID) %>%
mutate(regulationGroup = constructInfo2$statusCombined[match(Construct.ID, constructInfo2$Construct.ID)]) %>%
mutate(regulationGroupDetail = constructInfo2$status[match(Construct.ID, constructInfo2$Construct.ID)]) %>%
mutate(strongestPeakStrengthRatio = G4Peaks$xlinksKCL_ratio[match(Construct.ID, G4Peaks$ConstructID)]) %>%
mutate(constructStrengthRatio = G4Peaks$sum_KCL_ratio_norm[match(Construct.ID, G4Peaks$ConstructID)])
head(export_G4Constructs)
## Construct.ID Region.ID regulationGroup regulationGroupDetail
## 1 00003 non-regulated.1087 G4 Significant-Up
## 2 00011 non-regulated.82 G4 Significant-Up
## 3 00026 regulated.1137 G4 Significant-Mixed
## 4 00054 non-regulated.510 G4 Significant-Up
## 5 00076 non-regulated.111 G4 Significant-Up
## 6 00086 non-regulated.361 G4 Significant-Mixed
## strongestPeakStrengthRatio constructStrengthRatio
## 1 0.12029162 0.05847509
## 2 0.43892694 0.11887367
## 3 0.55917513 0.15376569
## 4 0.09424872 0.09424872
## 5 0.04566585 0.04566585
## 6 0.11026400 0.11026400
export_NoG4Constructs = noG4Const %>%
tibble::rownames_to_column("Construct.ID") %>%
select(Construct.ID, Region.ID) %>%
mutate(regulationGroup = constructInfo2$statusCombined[match(Construct.ID, constructInfo2$Construct.ID)]) %>%
mutate(regulationGroupDetail = constructInfo2$status[match(Construct.ID, constructInfo2$Construct.ID)]) %>%
mutate(strongestPeakStrengthRatio = NoG4Peaks$xlinksKCL_ratio[match(Construct.ID, NoG4Peaks$ConstructID)]) %>%
mutate(constructStrengthRatio = NoG4Peaks$sum_KCL_ratio_norm[match(Construct.ID, NoG4Peaks$ConstructID)])
head(export_NoG4Constructs)
## Construct.ID Region.ID regulationGroup regulationGroupDetail
## 1 00013 regulated.1091 No G4 Not significant
## 2 00024 regulated.1614 No G4 Significant-Down
## 3 00030 non-regulated.1125 No G4 Not significant
## 4 00040 regulated.217 No G4 Not significant
## 5 00072 regulated.755 No G4 Not significant
## 6 00075 non-regulated.1200 No G4 Not significant
## strongestPeakStrengthRatio constructStrengthRatio
## 1 0.012107531 0.012107531
## 2 0.012225067 0.010971214
## 3 0.006931074 0.006931074
## 4 0.317385445 0.146731806
## 5 0.012882448 0.012882448
## 6 0.040380048 0.040380048
constructClassification = rbind.data.frame(export_G4Constructs, export_NoG4Constructs)
saveRDS(constructClassification, file = "./data/constructClassification.rds")
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] janitor_2.1.0 stringr_1.4.1
## [3] ggseqlogo_0.1 xlsx_0.6.5
## [5] BindingSiteFinder_1.3.8 ggnewscale_0.4.8
## [7] ggrastr_1.0.1 gghalves_0.1.3
## [9] DESeq2_1.37.6 SummarizedExperiment_1.27.3
## [11] MatrixGenerics_1.9.1 matrixStats_0.62.0
## [13] ggsci_2.9 tidyr_1.2.1
## [15] ggpointdensity_0.1.0 ggforce_0.4.1
## [17] patchwork_1.1.2 ggtext_0.1.2
## [19] forcats_0.5.2 ComplexHeatmap_2.13.1
## [21] viridis_0.6.2 viridisLite_0.4.1
## [23] gridExtra_2.3 ggrepel_0.9.1
## [25] knitr_1.40 kableExtra_1.3.4
## [27] GenomicFeatures_1.49.7 UpSetR_1.4.0
## [29] reshape2_1.4.4 dplyr_1.0.10
## [31] AnnotationDbi_1.59.1 Biobase_2.57.1
## [33] ggplot2_3.3.6 rtracklayer_1.57.0
## [35] GenomicRanges_1.49.1 GenomeInfoDb_1.33.10
## [37] IRanges_2.31.2 S4Vectors_0.35.4
## [39] BiocGenerics_0.43.4 BiocStyle_2.25.0
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 tidyselect_1.2.0 RSQLite_2.2.18
## [4] htmlwidgets_1.5.4 BiocParallel_1.31.13 munsell_0.5.0
## [7] codetools_0.2-18 interp_1.1-3 withr_2.5.0
## [10] colorspace_2.0-3 filelock_1.0.2 highr_0.9
## [13] rstudioapi_0.14 rJava_1.0-6 labeling_0.4.2
## [16] GenomeInfoDbData_1.2.9 mixsqp_0.3-43 polyclip_1.10-0
## [19] bit64_4.0.5 farver_2.1.1 vctrs_0.5.1
## [22] generics_0.1.3 xfun_0.33 biovizBase_1.45.0
## [25] BiocFileCache_2.5.2 R6_2.5.1 doParallel_1.0.17
## [28] ggbeeswarm_0.7.0.9000 clue_0.3-61 invgamma_1.1
## [31] locfit_1.5-9.6 AnnotationFilter_1.21.0 bitops_1.0-7
## [34] cachem_1.0.6 DelayedArray_0.23.2 assertthat_0.2.1
## [37] BiocIO_1.7.1 scales_1.2.1 nnet_7.3-18
## [40] beeswarm_0.4.0 gtable_0.3.1 Cairo_1.6-0
## [43] ensembldb_2.21.5 rlang_1.0.6 genefilter_1.79.0
## [46] systemfonts_1.0.4 GlobalOptions_0.1.2 splines_4.2.1
## [49] lazyeval_0.2.2 dichromat_2.0-0.1 checkmate_2.1.0
## [52] BiocManager_1.30.18 yaml_2.3.5 backports_1.4.1
## [55] Hmisc_4.7-1 gridtext_0.1.5 tools_4.2.1
## [58] bookdown_0.29 ellipsis_0.3.2 jquerylib_0.1.4
## [61] RColorBrewer_1.1-3 Rcpp_1.0.9 plyr_1.8.7
## [64] base64enc_0.1-3 progress_1.2.2 zlibbioc_1.43.0
## [67] purrr_0.3.5 RCurl_1.98-1.9 prettyunits_1.1.1
## [70] rpart_4.1.16 deldir_1.0-6 GetoptLong_1.0.5
## [73] ashr_2.2-54 cluster_2.1.4 magrittr_2.0.3
## [76] data.table_1.14.2 circlize_0.4.15 truncnorm_1.0-8
## [79] SQUAREM_2021.1 ProtGenerics_1.29.1 xlsxjars_0.6.1
## [82] hms_1.1.2 evaluate_0.17 xtable_1.8-4
## [85] XML_3.99-0.11 jpeg_0.1-9 shape_1.4.6
## [88] compiler_4.2.1 biomaRt_2.53.3 tibble_3.1.8
## [91] crayon_1.5.2 htmltools_0.5.3 Formula_1.2-4
## [94] geneplotter_1.75.0 lubridate_1.8.0 DBI_1.1.3
## [97] tweenr_2.0.2 dbplyr_2.2.1 MASS_7.3-58.1
## [100] rappdirs_0.3.3 Matrix_1.5-1 cli_3.4.1
## [103] parallel_4.2.1 Gviz_1.41.1 pkgconfig_2.0.3
## [106] GenomicAlignments_1.33.1 foreign_0.8-83 xml2_1.3.3
## [109] foreach_1.5.2 svglite_2.1.0 annotate_1.75.0
## [112] vipor_0.4.5 bslib_0.4.0 webshot_0.5.4
## [115] XVector_0.37.1 rvest_1.0.3 snakecase_0.11.0
## [118] VariantAnnotation_1.43.3 digest_0.6.29 Biostrings_2.65.6
## [121] rmarkdown_2.17 htmlTable_2.4.1 restfulr_0.0.15
## [124] curl_4.3.3 Rsamtools_2.13.4 rjson_0.2.21
## [127] lifecycle_1.0.3 jsonlite_1.8.2 BSgenome_1.65.2
## [130] fansi_1.0.3 pillar_1.8.1 lattice_0.20-45
## [133] KEGGREST_1.37.3 fastmap_1.1.0 httr_1.4.4
## [136] survival_3.4-0 glue_1.6.2 png_0.1-7
## [139] iterators_1.0.14 bit_4.0.4 stringi_1.7.8
## [142] sass_0.4.2 blob_1.2.3 latticeExtra_0.6-30
## [145] memoise_2.0.1 irlba_2.3.5.1